In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
import plotly.express as px
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from itertools import combinations
Load the data¶
In [2]:
churn = pd.read_csv('df2_encoded.csv')
df = pd.DataFrame(churn)
#df.head(100)
# gathering descriptive statistics
df.describe(include='all')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1129 entries, 0 to 1128 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 stag 1129 non-null float64 1 event 1129 non-null int64 2 gender 1129 non-null int64 3 age 1129 non-null float64 4 industry 1129 non-null int64 5 profession 1129 non-null int64 6 traffic 1129 non-null int64 7 coach 1129 non-null int64 8 head_gender 1129 non-null int64 9 greywage 1129 non-null int64 10 way 1129 non-null int64 11 extraversion 1129 non-null float64 12 independ 1129 non-null float64 13 selfcontrol 1129 non-null float64 14 anxiety 1129 non-null float64 15 novator 1129 non-null float64 dtypes: float64(7), int64(9) memory usage: 141.3 KB
In [3]:
df.shape
Out[3]:
(1129, 16)
In [4]:
df.columns
Out[4]:
Index(['stag', 'event', 'gender', 'age', 'industry', 'profession', 'traffic',
'coach', 'head_gender', 'greywage', 'way', 'extraversion', 'independ',
'selfcontrol', 'anxiety', 'novator'],
dtype='object')
Modeling the Data¶
In [5]:
# looking for imbalance in the data set
df['event'].value_counts()
Out[5]:
event 1 571 0 558 Name: count, dtype: int64
In [6]:
# class imbalance percentage
print("The percentage of majority class is:", round((len(df[df['event']==1])/df.shape[0])* 100, 2),'%')
The percentage of majority class is: 50.58 %
- I will be using a logistic regression algorithm to model the data. For 'event', we have 50.6 % left the company and 49.4 % decided to stay. Therefore,we have a balanced class.
In [7]:
# dropping the duplicates and save the dataset in a new variable
#df_new = df.drop_duplicates()
#df_new.reset_index(inplace=True, drop=True)
#df_new.head(10)
Data Visualizations¶
In [8]:
df.hist(figsize=(14,9), xrot=45)
plt.show
Out[8]:
<function matplotlib.pyplot.show(close=None, block=None)>
Correlation Matrix and Heatmap¶
In [9]:
# creating a correlation matrix
corr=df.corr()
#corr
# heatmap of the correlation matrix
df_corr = df.corr().round(2)
fig = px.imshow(df_corr, text_auto = True, labels=dict(color="Correlation"), width=600, height=600)
fig.show()
- From the correlations alone, we see very weak correlation to 'event' as all columns have very weak to no linear correlations, meaning we cant really extract much information.
- Lets check whats the distribution of employee that left the company or not
In [10]:
fig = px.pie(df, "event", color='event', hole=.3)
fig.show()
- Checking if experience (time) is a factor that affects employee from resigning
In [11]:
fig = px.histogram(df, x="stag", color='event', marginal='box', barmode='group')
fig.show()
- Checking if age is a factor that affects employees from leaving the company
In [12]:
fig = px.histogram(df, x="age", color='event', marginal='box', barmode='group')
fig.show()
- Age does not have strong correaltion to employee resigning
A boxplot and a regplot of employee retention based on the anxiety level.¶
In [13]:
fig, axes =plt.subplots(1,2, figsize=(12,4))
sns.boxplot(x='event',y='anxiety', showfliers=False, data=df, ax=axes[0])
axes[0].set_xlabel("Employee resigned from the company or not")
axes[0].set_ylabel("Anxiety level")
axes[0].set_title("Boxplot for Anxiety Level vs employee left the company or not")
sns.regplot(x='event',y='anxiety', data=df, ax=axes[1])
axes[1].set_xlabel("Employee left the company or not")
axes[1].set_ylabel("Anxiety level")
axes[1].set_title("Linear Regression Model Fit")
plt.show()
In [14]:
import seaborn as sns
import matplotlib.pyplot as plt
# Assuming 'stag' and 'event' are columns in df
if 'stag' in df.columns:
# df['stag'] = df['stag'] / 12
df['stag'] = df['stag']
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Boxplot for 'stag'
sns.boxplot(x='event', y='stag', showfliers=False, data=df, ax=axes[0])
axes[0].set_xlabel("Employee resigned from the company or not")
axes[0].set_ylabel("selfcontrol")
axes[0].set_title("Boxplot for stag vs employee left the company or not")
# Linear regression plot for 'stag/12'
sns.regplot(x='event', y='stag', data=df, ax=axes[1])
axes[1].set_xlabel("Employee left the company or not")
axes[1].set_ylabel("stag [year]")
axes[1].set_title("Linear Regression Model Fit")
plt.show()
- From the boxplot we can see the employees who've left has a lower tenure than the employees who've stayed. The employees who work in the company longer are more likely to stay.
In [15]:
sns.countplot(data=df, x='profession', hue ='event')
plt.xlabel("profession")
plt.title("profession vs. If the employee left the company")
plt.legend(title = "Employee left?")
plt.xticks(rotation = 90)
plt.show()
Preparing Data for Modelling¶
- Our variable is 'event' i.e. we are trying to find out whether or not an
employee will leave the company, based on other independent variables. From the heatmap we can see, the correlation with other variable is weak.
- The dependent variable 'event' is binary i.e. it has only two results 0 (if the employee didn't leave) and 1 (if the employee left).
In [16]:
# 'event' is the target variable that we want to predict
target = df['event']
# Drop the target variable from the features
features = df.drop('event', axis=1)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
In [17]:
print(f'X_train : {X_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'X_test : {X_test.shape}')
print(f'y_test : {y_test.shape}')
X_train : (903, 15) y_train : (903,) X_test : (226, 15) y_test : (226,)
Initialize the Logistic Regression Model- No scaling¶
In [18]:
# Initialize the LogReg classifier
logreg1 = LogisticRegression(max_iter = 1000, random_state = 42)
# Fit the model on the training data
logreg1.fit(X_train, y_train)
# Make predictions on the training data
train_predictions = logreg1.predict(X_train)
# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_train, train_predictions)
# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)
plt.figure(figsize=(2, 1))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
# Get the classification report for the training data
class_report_train = classification_report(y_train, train_predictions)
print("Classification Report for Training Data:")
print(class_report_train)
# If you want to see the accuracy score for the training data
accuracy_score_train = logreg1.score(X_train, y_train)
print(f"Accuracy on Training Data: {accuracy_score_train}")
# If you want to see the accuracy score for the testing data
accuracy_score_test= logreg1.score(X_test, y_test)
print(f"Accuracy on Testing Data: {accuracy_score_test}")
Confusion Matrix: [[255 195] [185 268]]
Classification Report for Training Data:
precision recall f1-score support
0 0.58 0.57 0.57 450
1 0.58 0.59 0.59 453
accuracy 0.58 903
macro avg 0.58 0.58 0.58 903
weighted avg 0.58 0.58 0.58 903
Accuracy on Training Data: 0.5791805094130675
Accuracy on Testing Data: 0.5575221238938053
Using Traning data, with Standard Scaling¶
In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train)
X_test2 = scaler.fit_transform(X_test)
# Initialize the KNN classifier
logreg2 = LogisticRegression(max_iter = 1000, random_state = 42)
# Fit the model on the training data
logreg2.fit(X_train2, y_train)
# Make predictions on the training data
train_predictions2 = logreg2.predict(X_train2)
# Calculate the confusion matrix
conf_matrix2 = confusion_matrix(y_train, train_predictions2)
# Print the confusion matrix
print("Confusion Matrix2:")
print(conf_matrix2)
# Get the classification report for the training data
class_report_train2 = classification_report(y_train, train_predictions2)
print("Classification Report for Training Data:")
print(class_report_train2)
# If you want to see the accuracy score for the training data
accuracy_score_train2 = logreg2.score(X_train2, y_train)
print(f"Accuracy on Training Data: {accuracy_score_train2}")
# If you want to see the accuracy score for the testing data
accuracy_score_test2= logreg2.score(X_test2, y_test)
print(f"Accuracy on Testing Data: {accuracy_score_test2}")
Confusion Matrix2:
[[255 195]
[186 267]]
Classification Report for Training Data:
precision recall f1-score support
0 0.58 0.57 0.57 450
1 0.58 0.59 0.58 453
accuracy 0.58 903
macro avg 0.58 0.58 0.58 903
weighted avg 0.58 0.58 0.58 903
Accuracy on Training Data: 0.5780730897009967
Accuracy on Testing Data: 0.5530973451327433
Using only with Traning data, with MinMax Scaling¶
In [20]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train3 = scaler.fit_transform(X_train)
X_test3 = scaler.fit_transform(X_test)
# Initialize the KNN classifier
logreg3 = LogisticRegression(max_iter = 1000, random_state = 42)
# Fit the model on the training data
logreg3.fit(X_train3, y_train)
# Make predictions on the training data
train_predictions3 = logreg3.predict(X_train3)
# Calculate the confusion matrix
conf_matrix3 = confusion_matrix(y_train, train_predictions3)
# Print the confusion matrix
print("Confusion Matrix3:")
print(conf_matrix3)
# Get the classification report for the training data
class_report_train3 = classification_report(y_train, train_predictions3)
print("Classification Report for Training Data:")
print(class_report_train3)
# If you want to see the accuracy score for the training data
accuracy_score_train3 = logreg3.score(X_train3, y_train)
print(f"Accuracy on Training Data: {accuracy_score_train3}")
# If you want to see the accuracy score for the testing data
accuracy_score_test3= logreg3.score(X_test3, y_test)
print(f"Accuracy on Testing Data: {accuracy_score_test3}")
Confusion Matrix3:
[[264 186]
[188 265]]
Classification Report for Training Data:
precision recall f1-score support
0 0.58 0.59 0.59 450
1 0.59 0.58 0.59 453
accuracy 0.59 903
macro avg 0.59 0.59 0.59 903
weighted avg 0.59 0.59 0.59 903
Accuracy on Training Data: 0.5858250276854928
Accuracy on Testing Data: 0.5575221238938053
Start with Cross_Val_Score (3 folder is best) with MinMaxScaler¶
In [21]:
from sklearn.model_selection import cross_val_score
logreg4 = LogisticRegression(max_iter = 1000, random_state = 42)
for i in range(2, 11):
scores = cross_val_score(logreg4, X_train3, y_train, cv=i, scoring='accuracy') #with Standard Scaling
mean_score = scores.mean()
std_dev = scores.std()
print(f"Mean accuracy for {i} folds: {mean_score:.3f}")
print(f"Standard deviation for {i} folds: {std_dev:.3f}")
print('*'*50)
Mean accuracy for 2 folds: 0.540 Standard deviation for 2 folds: 0.007 ************************************************** Mean accuracy for 3 folds: 0.555 Standard deviation for 3 folds: 0.003 ************************************************** Mean accuracy for 4 folds: 0.553 Standard deviation for 4 folds: 0.010 ************************************************** Mean accuracy for 5 folds: 0.552 Standard deviation for 5 folds: 0.038 ************************************************** Mean accuracy for 6 folds: 0.546 Standard deviation for 6 folds: 0.011 ************************************************** Mean accuracy for 7 folds: 0.538 Standard deviation for 7 folds: 0.045 ************************************************** Mean accuracy for 8 folds: 0.543 Standard deviation for 8 folds: 0.034 ************************************************** Mean accuracy for 9 folds: 0.544 Standard deviation for 9 folds: 0.042 ************************************************** Mean accuracy for 10 folds: 0.550 Standard deviation for 10 folds: 0.058 **************************************************
In [ ]:
- here start with Cross_Val_Score with MinMax Scaling
Use with GridSearchCV with Standard Scaling¶
In [22]:
param_grid = {
'C': [ 1, 10, 100],
'penalty': ['l1', 'l2'],
'solver': ['liblinear', 'saga']}
# Instantiate logistic regression model
logistic_model = LogisticRegression(max_iter = 1000, random_state = 42)
# Create GridSearchCV object
grid_search = GridSearchCV(logistic_model, param_grid, cv =3,
scoring = ['accuracy', 'recall', 'precision', 'f1'], refit = 'recall')
logistic_model
Out[22]:
LogisticRegression(max_iter=1000, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(max_iter=1000, random_state=42)
In [23]:
import warnings
warnings.filterwarnings('ignore', module='sklearn')
# Fit the grid search to the data
grid_search.fit(X_train3, y_train)
# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)
# Get the best model
logistic_model_best_estimater = grid_search.best_estimator_
Best Hyperparameters: {'C': 1, 'penalty': 'l2', 'solver': 'saga'}
In [24]:
logistic_model_best_estimater
Out[24]:
LogisticRegression(C=1, max_iter=1000, random_state=42, solver='saga')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(C=1, max_iter=1000, random_state=42, solver='saga')
In [25]:
logistic_model_best_estimater.fit(X_train3,y_train)
Out[25]:
LogisticRegression(C=1, max_iter=1000, random_state=42, solver='saga')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(C=1, max_iter=1000, random_state=42, solver='saga')
In [26]:
print (f'Train Accuracy - : {logistic_model_best_estimater.score(X_train3, y_train):.3f}') #Mit Sklaierung
print (f'Test Accuracy - : {logistic_model_best_estimater.score(X_test3,y_test):.3f}') # Mit Sklaierun
Train Accuracy - : 0.586 Test Accuracy - : 0.558
- Feature Improtance
- A function is defined below to gererate a sorted coefficient table.
In [27]:
# Write a function to generate coefficient table
def get_coefficient_table(model, training_set):
'''
Generate a coefficient table for a regression model.
Parameters:
- model (object): The trained model for which coefficients are to be analyzed.
- training_set (DataFrame): The DataFrame containing the features used for training the model.
Returns:
DataFrame: A table containing feature names and their corresponding coefficients.
The table is sorted by the magnitude (absolute value) of the coefficients in descending order.
'''
# Create coefficient table
coefficient_table = pd.DataFrame(training_set.columns, columns = ['Features']).copy()
coefficient_table.insert(len(coefficient_table.columns), "Coefficients", model.coef_.transpose())
# Sort coefficient table by magnitude (absolute value) of the coefficients
coefficient_table['Absolute_Coefs'] = coefficient_table['Coefficients'].abs()
sorted_coefficient_table = coefficient_table.sort_values(by = 'Absolute_Coefs', ascending=False, ignore_index=True)
sorted_coefficient_table = sorted_coefficient_table.drop(columns = ['Absolute_Coefs'])
return sorted_coefficient_table
- Now we can have a coefficient table sorted by the magnitude of each feature's impact on employee turnover.
In [28]:
# Create a coefficient table from the selected model
get_coefficient_table(logistic_model_best_estimater, X_train)
Out[28]:
| Features | Coefficients | |
|---|---|---|
| 0 | age | -1.034868 |
| 1 | anxiety | -0.740016 |
| 2 | stag | -0.509231 |
| 3 | way | 0.398189 |
| 4 | traffic | 0.394109 |
| 5 | selfcontrol | -0.309708 |
| 6 | independ | 0.309617 |
| 7 | greywage | -0.296845 |
| 8 | head_gender | 0.217827 |
| 9 | coach | -0.176564 |
| 10 | profession | -0.153280 |
| 11 | extraversion | -0.107094 |
| 12 | industry | 0.044168 |
| 13 | novator | 0.040276 |
| 14 | gender | 0.039417 |
- The table shows that key features influencing employee turnover include age, anxiety, stag, way, traffic. Additionally, self control independent and greywage.
In [ ]: